# ========================================================================= #
# Project: Lexical Ambiguity in Political Rhetoric (BJPolS)
# - Script: Compare sentence embeddings of moral foundation terms
# - Author: Patrick Kraft (patrickwilli.kraft@uc3m.es)
# ========================================================================= #


# Load packages and custom functions --------------------------------------

source(here::here("code/00-func.R"))


# Load sentences & embeddings ---------------------------------------------

# Build the sentence-level data set: sentence metadata, the embedding columns
# (attached by row position), and one 0/1 indicator per moral foundation based
# on the MFD 2.0 dictionary. Only sentences that mention at least one
# foundation are kept, and rows whose party is "Liberal" are dropped.
df <- local({
  # sentences.csv and embeddings.csv are combined column-wise, i.e. row i of
  # one file is assumed to describe row i of the other
  sentences <- bind_cols(
    read_csv(here("out/sentences.csv")),
    read_csv(here("out/embeddings.csv"), col_names = FALSE)
  )

  mfd <- dictionary(file = here("in/mfd2.0.dic"), format = "LIWC")

  # dictionary hit counts per sentence (columns such as care.virtue, care.vice)
  hits <- sentences$sentence %>%
    corpus() %>%
    tokens() %>%
    tokens_lookup(dictionary = mfd) %>%
    dfm() %>%
    convert("data.frame")

  # a foundation counts as mentioned when any virtue or vice term occurs
  mentioned <- transmute(
    hits,
    Care = as.numeric(care.virtue + care.vice > 0),
    Fairness = as.numeric(fairness.virtue + fairness.vice > 0),
    Loyalty = as.numeric(loyalty.virtue + loyalty.vice > 0),
    Authority = as.numeric(authority.virtue + authority.vice > 0),
    Sanctity = as.numeric(sanctity.virtue + sanctity.vice > 0)
  )

  sentences %>%
    bind_cols(mentioned) %>%
    filter(
      Care + Fairness + Loyalty + Authority + Sanctity > 0,
      party != "Liberal"
    ) %>%
    # pool the US and UK parties into two comparable groups
    mutate(
      party_group = recode_factor(
        party,
        Republican = "Rep/Con",
        Conservative = "Rep/Con",
        Democratic = "Dem/Lab",
        Labour = "Dem/Lab"
      )
    )
})


# Compute between vs. within party cosine similarities --------------------
## (for each sentence within type and foundation)

# One row per doc_id with five columns (Care ... Sanctity), each holding the
# avg_cosine_diff that cosineComparison() returns for that foundation.
#
# For every foundation: keep the sentences that mention it, run
# cosineComparison() separately within each speech type, and rename its
# avg_cosine_diff column to the foundation name. The per-foundation tables are
# then left-joined, in order, onto the document-level base table. All five
# foundations go through the same code path, so they cannot drift apart.
df_cosine <- c("Care", "Fairness", "Loyalty", "Authority", "Sanctity") %>%
  map(function(fnd) {
    df %>%
      filter(.data[[fnd]] == 1) %>%
      split(.$type) %>%
      map(cosineComparison) %>%
      # .id restores the speech type from the names of the split list
      map_dfr(
        function(res) rename(res, !!fnd := avg_cosine_diff),
        .id = "type"
      )
  }) %>%
  reduce(
    left_join,
    # No explicit `by`: each join uses whatever columns the two tables share.
    # NOTE(review): the columns returned by cosineComparison() are defined in
    # code/00-func.R; consider spelling the join keys out once confirmed.
    .init = df %>%
      select(type, doc_id, year, party, speaker) %>%
      filter(!duplicated(doc_id))
  )

## rescaling
# Divide every cosine difference by the standard deviation of its
# (speech type x foundation) cell so that effect sizes are comparable across
# panels. Missing values (documents without a given foundation) are ignored
# when computing the standard deviation and stay missing afterwards.
# NOTE(review): the result presumably remains grouped by `type` after
# pivot_wider(); later steps regroup or filter by type, so this is left as is.
df_cosine <- df_cosine %>%
  pivot_longer(Care:Sanctity, names_to = "name", values_to = "value") %>%
  group_by(type, name) %>%
  mutate(value = value / sd(value, na.rm = TRUE)) %>%
  pivot_wider(names_from = name, values_from = value)


# Bootstrap CIs for cosine differences ------------------------------------

# Fixed seed so the bootstrap draws, and therefore the intervals, reproduce.
set.seed(42)
nboot <- 250

# Bootstrap summary of the cosine differences, one row per speech type and
# foundation. For each speech type (processed in the order listed, which also
# fixes the order in which random numbers are consumed):
#   1. draw `nboot` resamples of the documents, stratified by party;
#   2. pass every resample to cosineBoot() (defined in code/00-func.R; its
#      result is used here through the columns `foundation` and `avg`);
#   3. summarise the bootstrap distribution of `avg` per foundation by its
#      mean and its 95% / 90% percentile intervals.
plot_df <- c(
  "a) US State of the Union",
  "b) UK Queen's Speeches",
  "c) US Convention Speeches",
  "d) UK Party Leader Speeches",
  "e) US Presidential Debates",
  "f) US Senate Emails"
) %>%
  map(function(corpus_type) {
    df_cosine %>%
      # .env$ makes sure the loop variable is used, never a data column
      filter(type == .env$corpus_type) %>%
      bootstraps(nboot, strata = party) %>%
      mutate(splits = map(splits, cosineBoot)) %>%
      unnest(cols = splits) %>%
      group_by(foundation) %>%
      summarize(
        average = mean(avg, na.rm = TRUE),
        cilo95 = quantile(avg, probs = .025, na.rm = TRUE),
        cihi95 = quantile(avg, probs = .975, na.rm = TRUE),
        cilo90 = quantile(avg, probs = .05, na.rm = TRUE),
        cihi90 = quantile(avg, probs = .95, na.rm = TRUE)
      ) %>%
      mutate(type = corpus_type)
  }) %>%
  bind_rows()

# Figure 2: bootstrap means with 95% (thin, no caps) and 90% (capped) intervals
# for the four speech corpora; debates and emails get their own figures.
plot_df %>%
  mutate(
    # panel order: the US corpora first, then the UK corpora
    type = factor(
      type,
      levels = c(
        "a) US State of the Union",
        "c) US Convention Speeches",
        "e) US Presidential Debates",
        "b) UK Queen's Speeches",
        "d) UK Party Leader Speeches",
        "f) US Senate Emails"
      ),
      labels = c(
        "a) US State of the Union (N = 80)",
        "c) US Convention Speeches (N = 39)",
        "e) US Presidential Debates (N = 84)",
        "b) UK Queen's Speeches (N = 65)",
        "d) UK Party Leader Speeches (N = 165)",
        "f) US Senate Emails (N = 1000)"
      )
    ),
    # reversed so that Care ends up at the top once the axes are flipped
    foundation = factor(
      foundation,
      levels = c("Sanctity", "Authority", "Loyalty", "Fairness", "Care")
    )
  ) %>%
  filter(
    !(type == "e) US Presidential Debates (N = 84)" |
        type == "f) US Senate Emails (N = 1000)")
  ) %>%
  ggplot(
    mapping = aes(
      x = foundation, y = average,
      ymin = cilo95, ymax = cihi95
    )
  ) +
  geom_hline(yintercept = 0, col = "darkgrey") +
  geom_point(size = 1.5) +
  geom_errorbar(width = 0) +
  geom_errorbar(aes(ymin = cilo90, ymax = cihi90), width = .2) +
  facet_wrap(. ~ type) +
  labs(
    x = NULL,
    y = "Average difference in cosine similarities (within parties - between parties)"
  ) +
  coord_flip() +
  theme_mft()
ggsave(
  here("out/fig02-sentence_cosines.png"),
  width = 6, height = 5, dpi = 600
)

# Figure 4b: same display as Figure 2, restricted to the presidential debates
# (single panel, hence no facetting and no need to order the speech types).
plot_df %>%
  filter(type == "e) US Presidential Debates") %>%
  mutate(
    # reversed so that Care ends up at the top once the axes are flipped
    foundation = factor(
      foundation,
      levels = c("Sanctity", "Authority", "Loyalty", "Fairness", "Care")
    )
  ) %>%
  ggplot(
    mapping = aes(
      x = foundation, y = average,
      ymin = cilo95, ymax = cihi95
    )
  ) +
  geom_hline(yintercept = 0, col = "darkgrey") +
  geom_point(size = 1.5) +
  geom_errorbar(width = 0) +
  geom_errorbar(aes(ymin = cilo90, ymax = cihi90), width = .2) +
  labs(
    title = "Differences in Moral Context",
    x = NULL,
    y = "Average difference in cosine similarities\n(within parties - between parties)"
  ) +
  coord_flip() +
  theme_mft()
ggsave(
  here("out/fig04b-sentence_cosines_debates.png"),
  width = 3, height = 2.5, dpi = 600
)

# Figure 5b: same display as Figure 2, restricted to the Senate emails
# (single panel, hence no facetting and no need to order the speech types).
plot_df %>%
  filter(type == "f) US Senate Emails") %>%
  mutate(
    # reversed so that Care ends up at the top once the axes are flipped
    foundation = factor(
      foundation,
      levels = c("Sanctity", "Authority", "Loyalty", "Fairness", "Care")
    )
  ) %>%
  ggplot(
    mapping = aes(
      x = foundation, y = average,
      ymin = cilo95, ymax = cihi95
    )
  ) +
  geom_hline(yintercept = 0, col = "darkgrey") +
  geom_point(size = 1.5) +
  geom_errorbar(width = 0) +
  geom_errorbar(aes(ymin = cilo90, ymax = cihi90), width = .2) +
  labs(
    title = "Differences in Moral Context",
    x = NULL,
    y = "Average difference in cosine similarities\n(within parties - between parties)"
  ) +
  coord_flip() +
  theme_mft()
ggsave(
  here("out/fig05b-sentence_cosines_emails.png"),
  width = 3, height = 2.5, dpi = 600
)


# Compute difference in cosine similarities over time ---------------------

# Figure 3: average (rescaled) cosine difference per foundation, speech type
# and period, for the four speech corpora.
df_cosine %>%
  mutate(
    # Periods are half-open on the right: the first one covers every year up
    # to and including 1959, so it is labelled "Before 1960".
    time = case_when(
      year < 1960 ~ "Before 1960",
      year >= 1960 & year < 1980 ~ "1960 to 1979",
      year >= 1980 & year < 2000 ~ "1980 to 1999",
      year >= 2000 ~ "2000 to 2020"
    ),
    time = factor(
      time,
      levels = c("Before 1960", "1960 to 1979", "1980 to 1999", "2000 to 2020")
    )
  ) %>%
  group_by(time, type) %>%
  # Documents without a given foundation carry NA; drop them from the mean
  # (as the rescaling and bootstrap steps do) instead of letting a single NA
  # blank out the whole period.
  summarize(
    across(Care:Sanctity, ~ mean(.x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  pivot_longer(Care:Sanctity) %>%
  mutate(
    foundation = factor(
      name,
      levels = c("Care", "Fairness", "Loyalty", "Authority", "Sanctity")
    ),
    # panel order: the US corpora first, then the UK corpora
    type = factor(
      type,
      levels = c(
        "a) US State of the Union",
        "c) US Convention Speeches",
        "e) US Presidential Debates",
        "b) UK Queen's Speeches",
        "d) UK Party Leader Speeches",
        "f) US Senate Emails"
      ),
      labels = c(
        "a) US State of the Union (N = 80)",
        "c) US Convention Speeches (N = 39)",
        "e) US Presidential Debates (N = 84)",
        "b) UK Queen's Speeches (N = 65)",
        "d) UK Party Leader Speeches (N = 165)",
        "f) US Senate Emails (N = 1000)"
      )
    )
  ) %>%
  filter(
    type != "e) US Presidential Debates (N = 84)",
    type != "f) US Senate Emails (N = 1000)"
  ) %>%
  ggplot(aes(
    x = time, y = value,
    col = foundation, lty = foundation,
    group = foundation, shape = foundation
  )) +
  labs(
    x = NULL,
    y = "Average difference in cosine similarities\n(within parties - between parties)"
  ) +
  scale_colour_brewer(palette = "Dark2") +
  geom_point() +
  geom_line() +
  geom_hline(yintercept = 0, col = "darkgrey") +
  facet_wrap(~type) +
  theme_mft()
ggsave(here("out/fig03-time_cosines.png"), height = 5, width = 8, dpi = 600)
